import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
/var/folders/1c/900rr4bd26n5443n1_vyhrmc0000gn/T/ipykernel_4006/4099310058.py:4: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython.display
  from IPython.core.display import display, HTML
df = pd.read_csv('credit.csv')
df
оhchecking_balance months_loan_duration credit_history purpose amount savings_balance employment_duration percent_of_income years_at_residence age other_credit housing existing_loans_count job dependents phone default
0 < 0 DM 6 critical furniture/appliances 1169 unknown > 7 years 4 4 67 none own 2 skilled 1 yes no
1 1 - 200 DM 48 good furniture/appliances 5951 < 100 DM 1 - 4 years 2 2 22 none own 1 skilled 1 no yes
2 unknown 12 critical education 2096 < 100 DM 4 - 7 years 2 3 49 none own 1 unskilled 2 no no
3 < 0 DM 42 good furniture/appliances 7882 < 100 DM 4 - 7 years 2 4 45 none other 1 skilled 2 no no
4 < 0 DM 24 poor car 4870 < 100 DM 1 - 4 years 3 4 53 none other 2 skilled 2 no yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 unknown 12 good furniture/appliances 1736 < 100 DM 4 - 7 years 3 4 31 none own 1 unskilled 1 no no
996 < 0 DM 30 good car 3857 < 100 DM 1 - 4 years 4 4 40 none own 1 management 1 yes no
997 unknown 12 good furniture/appliances 804 < 100 DM > 7 years 4 4 38 none own 1 skilled 1 no no
998 < 0 DM 45 good furniture/appliances 1845 < 100 DM 1 - 4 years 4 4 23 none other 1 skilled 1 yes yes
999 1 - 200 DM 45 critical car 4576 100 - 500 DM unemployed 3 4 27 none own 1 skilled 1 no no

1000 rows × 17 columns

Data cleanup, steps:#

1) Renaming of columns#

2) Removing unnecessary columns#

3) Replacing values in columns#

# Renaming of columns
df = df.rename(columns = {'оhchecking_balance':'Checking_balance'})
df = df.rename(columns = {'months_loan_duration':'Loan_duration'})
df = df.rename(columns = {'credit_history':'Credit_history'})
df = df.rename(columns = {'purpose':'Purpose'})
df = df.rename(columns = {'amount':'Amount'})
df = df.rename(columns = {'savings_balance':'Savings_balance'})
df = df.rename(columns = {'employment_duration':'Employment_duration'})
df = df.rename(columns = {'percent_of_income':'Percent_of_income'})
df = df.rename(columns = {'years_at_residence':'Years_at_residrnce'})
df = df.rename(columns = {'age':'Age'})
df = df.rename(columns = {'other_credit':'Other_credit'})
df = df.rename(columns = {'housing':'Housing'})
df = df.rename(columns = {'existing_loans_count':'Existing_loans_count'})
df = df.rename(columns = {'job':'Job'})
df = df.rename(columns = {'dependents':'Dependents'})
df = df.rename(columns = {'phone':'Phone'})
df = df.rename(columns = {'default':'Default'})
df
Checking_balance Loan_duration Credit_history Purpose Amount Savings_balance Employment_duration Percent_of_income Years_at_residrnce Age Other_credit Housing Existing_loans_count Job Dependents Phone Default
0 < 0 DM 6 critical furniture/appliances 1169 unknown > 7 years 4 4 67 none own 2 skilled 1 yes no
1 1 - 200 DM 48 good furniture/appliances 5951 < 100 DM 1 - 4 years 2 2 22 none own 1 skilled 1 no yes
2 unknown 12 critical education 2096 < 100 DM 4 - 7 years 2 3 49 none own 1 unskilled 2 no no
3 < 0 DM 42 good furniture/appliances 7882 < 100 DM 4 - 7 years 2 4 45 none other 1 skilled 2 no no
4 < 0 DM 24 poor car 4870 < 100 DM 1 - 4 years 3 4 53 none other 2 skilled 2 no yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 unknown 12 good furniture/appliances 1736 < 100 DM 4 - 7 years 3 4 31 none own 1 unskilled 1 no no
996 < 0 DM 30 good car 3857 < 100 DM 1 - 4 years 4 4 40 none own 1 management 1 yes no
997 unknown 12 good furniture/appliances 804 < 100 DM > 7 years 4 4 38 none own 1 skilled 1 no no
998 < 0 DM 45 good furniture/appliances 1845 < 100 DM 1 - 4 years 4 4 23 none other 1 skilled 1 yes yes
999 1 - 200 DM 45 critical car 4576 100 - 500 DM unemployed 3 4 27 none own 1 skilled 1 no no

1000 rows × 17 columns

# Removing unnecessary columns
df.drop(columns = ['Dependents'], inplace = True)
df.drop(columns = ['Phone'], inplace = True)
df
Checking_balance Loan_duration Credit_history Purpose Amount Savings_balance Employment_duration Percent_of_income Years_at_residrnce Age Other_credit Housing Existing_loans_count Job Default
0 < 0 DM 6 critical furniture/appliances 1169 unknown > 7 years 4 4 67 none own 2 skilled no
1 1 - 200 DM 48 good furniture/appliances 5951 < 100 DM 1 - 4 years 2 2 22 none own 1 skilled yes
2 unknown 12 critical education 2096 < 100 DM 4 - 7 years 2 3 49 none own 1 unskilled no
3 < 0 DM 42 good furniture/appliances 7882 < 100 DM 4 - 7 years 2 4 45 none other 1 skilled no
4 < 0 DM 24 poor car 4870 < 100 DM 1 - 4 years 3 4 53 none other 2 skilled yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 unknown 12 good furniture/appliances 1736 < 100 DM 4 - 7 years 3 4 31 none own 1 unskilled no
996 < 0 DM 30 good car 3857 < 100 DM 1 - 4 years 4 4 40 none own 1 management no
997 unknown 12 good furniture/appliances 804 < 100 DM > 7 years 4 4 38 none own 1 skilled no
998 < 0 DM 45 good furniture/appliances 1845 < 100 DM 1 - 4 years 4 4 23 none other 1 skilled yes
999 1 - 200 DM 45 critical car 4576 100 - 500 DM unemployed 3 4 27 none own 1 skilled no

1000 rows × 15 columns

# Replacing values in columns
df['Purpose'] = df['Purpose'].str.replace('car0', 'car', regex=False)
df['Checking_balance'] = df['Checking_balance'].str.replace('unknown', '>= 0 DM', regex=False)
df['Savings_balance'] = df['Savings_balance'].str.replace('unknown', '>= 0 DM', regex=False)
df['Employment_duration'] = df['Employment_duration'].str.replace('unemployed', '0 years', regex=False)

Result of Data cleanup:#

df
Checking_balance Loan_duration Credit_history Purpose Amount Savings_balance Employment_duration Percent_of_income Years_at_residrnce Age Other_credit Housing Existing_loans_count Job Default
0 < 0 DM 6 critical furniture/appliances 1169 >= 0 DM > 7 years 4 4 67 none own 2 skilled no
1 1 - 200 DM 48 good furniture/appliances 5951 < 100 DM 1 - 4 years 2 2 22 none own 1 skilled yes
2 >= 0 DM 12 critical education 2096 < 100 DM 4 - 7 years 2 3 49 none own 1 unskilled no
3 < 0 DM 42 good furniture/appliances 7882 < 100 DM 4 - 7 years 2 4 45 none other 1 skilled no
4 < 0 DM 24 poor car 4870 < 100 DM 1 - 4 years 3 4 53 none other 2 skilled yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 >= 0 DM 12 good furniture/appliances 1736 < 100 DM 4 - 7 years 3 4 31 none own 1 unskilled no
996 < 0 DM 30 good car 3857 < 100 DM 1 - 4 years 4 4 40 none own 1 management no
997 >= 0 DM 12 good furniture/appliances 804 < 100 DM > 7 years 4 4 38 none own 1 skilled no
998 < 0 DM 45 good furniture/appliances 1845 < 100 DM 1 - 4 years 4 4 23 none other 1 skilled yes
999 1 - 200 DM 45 critical car 4576 100 - 500 DM 0 years 3 4 27 none own 1 skilled no

1000 rows × 15 columns

Descriptive statistics:#

# Statistics include mean, median and standard deviation of the 'Age' column
print('Average age:',df['Age'].mean())
print('Median of age:',df['Age'].median())
print('Standard deviation of age:',df['Age'].std())
mean_age = df['Age'].mean()
median_age = df['Age'].median()
std_age = df['Age'].std()
categories = ['Mean', 'Median', 'Standart devitation']
values = [mean_age, median_age, std_age]
plt.figure(figsize=(8, 6)) 
plt.bar(categories, values) 
plt.ylabel('Number meaning')
plt.title('Statistics of Age')
plt.show()
Average age: 35.546
Median of age: 33.0
Standard deviation of age: 11.375468574317512
_images/92e1cee5addab3d0ebfb4c6e828be8c70bbce7a5debedfbca62691b2aa272793.png
# Statistics include mean, median and standard deviation of the 'Amount' column
print('Average amount:',df['Amount'].mean())
print('Median of amount:',df['Amount'].median())
print('Standard deviation of amount:',df['Amount'].std())
mean_amount = df['Amount'].mean()
median_amount = df['Amount'].median()
std_amount = df['Amount'].std()
categories = ['Mean', 'Median', 'Standart devitation']
values = [mean_amount, median_amount, std_amount]
plt.figure(figsize=(8, 6)) 
plt.bar(categories, values) 
plt.ylabel('Number meaning')
plt.title('Statistics of Amount')
plt.show()
Average amount: 3271.258
Median of amount: 2319.5
Standard deviation of amount: 2822.7368759604406
_images/296073cdcf09fea0c65d5f4416c804de01808164c350a2cddf6ceb84588c0302.png
# Statistics include mean, median and standard deviation of the 'Loan duration' column
print('Average duration:',df['Loan_duration'].mean())
print('Median of duration:',df['Loan_duration'].median())
print('Standard deviation of duration:',df['Loan_duration'].std())
mean_LD = df['Loan_duration'].mean()
median_LD = df['Loan_duration'].median()
std_LD = df['Loan_duration'].std()
categories = ['Mean', 'Median', 'Standart devitation']
values = [mean_LD, median_LD, std_LD]
plt.figure(figsize=(8, 6)) 
plt.bar(categories, values) 
plt.ylabel('Number meaning')
plt.title('Statistics of Loan duration')
plt.show()
Average duration: 20.903
Median of duration: 18.0
Standard deviation of duration: 12.058814452756375
_images/844d0089fc8ca457c4bbc612cba7e1228a4794c6a00d0c78cc5d6747bc1487f5.png
!pip install plotly
Requirement already satisfied: plotly in ./.venv/lib/python3.10/site-packages (5.24.1)
Requirement already satisfied: tenacity>=6.2.0 in ./.venv/lib/python3.10/site-packages (from plotly) (9.0.0)
Requirement already satisfied: packaging in ./.venv/lib/python3.10/site-packages (from plotly) (24.2)
[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: pip install --upgrade pip

Plots#

# Plot of amount
plt.figure(figsize=(10, 6))
plt.hist(df[df['Amount'] < 50000]['Amount'], bins=100, color='skyblue', edgecolor='black')
plt.title('Amount', fontsize=16)
plt.xlabel('Deutschmarks', fontsize=12)
plt.ylabel('Number of people', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
_images/16e83be3e996c331c3732d56d4b6686530252610a7709f841ac015114e504d22.png
# Plot of purpose
labels = df.sort_values(by='Purpose').Purpose.unique()
values = df.sort_values(by='Purpose').Purpose.value_counts(sort=False)
fig, ax = plt.subplots()
ax.pie(values, labels=labels)
plt.show()
_images/6df595e3a419dc4bcb067031bfa9af49c0a0bcc5168afd8d160ef35b1b0242dc.png
# Quality of credit history
x = df.sort_values(by='Credit_history').Credit_history.unique()
y = df.sort_values(by='Credit_history').Credit_history.value_counts(sort=False)
fig, ax = plt.subplots()
ax.bar(x, y)
<BarContainer object of 5 artists>
_images/ef1add5c16c2218f62f43b66100ce5480ff047be9f3bf98889c13fcbaca4b522.png
#Distribution of borrowers ages
plt.figure(figsize=(8, 6))
plt.hist(df['Age'], bins=10, edgecolor='black', alpha=0.7)
plt.xlabel('Age')
plt.ylabel('Number of people')
plt.title('Distribution of borrowers ages')
plt.show()
_images/71150fd61c08bb269582f8e8a449099ab98f0c4458819250c084206ddf926860.png
plt.figure(figsize=(8,6))
credit_history_counts = df.groupby('Credit_history')['Default'].value_counts().unstack()
credit_history_counts.plot(kind='bar', stacked=True)
plt.xlabel('Credit history')
plt.ylabel('Number of defaults')
plt.title('Frequency of defaults depending on credit history')
plt.legend(['No default', 'Default'])
plt.show()
<Figure size 800x600 with 0 Axes>
_images/68cf4ef48124271fed5bc4b739a16c7bfbe71d26798e910feb027f466ce13a73.png
# Graph of Purpose preferences by job
import pandas as pd
import plotly.express as px
print('Purposes among skilled people')
condition_1 = df['Job'] == 'skilled'
filtered_df = df[condition_1]
value_counts = filtered_df['Purpose'].value_counts()
print(value_counts.to_string())
print('-----------------------------')
print('Purposes among management people')
condition_1 = df['Job'] == 'management'
filtered_df = df[condition_1]
value_counts = filtered_df['Purpose'].value_counts()
print(value_counts.to_string())
print('-----------------------------')
print('Purposes among unskilled people')
condition_1 = df['Job'] == 'unskilled'
filtered_df = df[condition_1]
value_counts = filtered_df['Purpose'].value_counts()
print(value_counts.to_string())
print('-----------------------------')
print('Purposes among unemployed people')
condition_1 = df['Job'] == 'unemployed'
filtered_df = df[condition_1]
value_counts = filtered_df['Purpose'].value_counts()
print(value_counts.to_string())

difficulty_counts = df.groupby(['Job', 'Purpose']).size().reset_index(name='Count')

# Create a grouped bar chart for Purpose preferences by Job
fig = px.bar(difficulty_counts, 
             x='Job', 
             y='Count', 
             color='Purpose', 
             title="Purpose preferences by Job",
             labels={'Purpose': 'Purpose', 'Count': 'Number of people'},
             barmode='group', 
            color_discrete_map={'skilled': '#FF0000', 'unskulled': '#FFD700', 'management': '#00FFFF', 'uneployed': '#ADFF2F'})  # Grouped bars for comparison

# Show the plot
fig.update_yaxes(dtick=50)
fig.show()
fig.write_html('g1.html')
display(HTML('g1.html'))
Purposes among skilled people
furniture/appliances    331
car                     191
business                 60
education                35
renovations              13
-----------------------------
Purposes among management people
car                     78
furniture/appliances    47
business                15
education                8
-----------------------------
Purposes among unskilled people
furniture/appliances    91
car                     67
business                20
education               15
renovations              7
-----------------------------
Purposes among unemployed people
car                     13
furniture/appliances     4
renovations              2
business                 2
education                1
plt.figure(figsize=(10, 6))
for history in df['Credit_history'].unique():
    subset = df[df['Credit_history'] == history]
    plt.scatter(subset['Age'], subset['Amount'], label=f'Credit History: {history}', marker='o', s=50)


plt.xlabel('Age')
plt.ylabel('Loan amount')
plt.title('Relationship between age, loan amount and credit histoty')
plt.legend()
plt.grid(True)
plt.show()
_images/f9cb5af983f5847549035d02aaa57af6847d457d430bede7d753550dc6ea49ef.png

A hypothesis check:#

Borrowes with longer work experience are less likely to default on the loan#

labels = df.sort_values(by='Employment_duration').Employment_duration.unique()
values = df.sort_values(by='Employment_duration').Employment_duration.value_counts(sort=False)
fig, ax = plt.subplots()
ax.pie(values, labels=labels)
plt.show()
_images/a2fc707b5cca17759ac2ea307ede0ce7111faaca189b4f9959d0400c1e61bc52.png
import pandas as pd

df['Default'] = df['Default'].map({'no': 0, 'yes': 1})

employment_mapping = {'0 years': 0, '1 - 4 years': 1, '4 - 7 years': 2, '> 7 years': 3, '< 1 year':4}
df['Employment_duration'] = df['Employment_duration'].map(employment_mapping)

df['Employment_duration'].fillna(0, inplace=True)

pivot_table = pd.pivot_table(df, values='Default', index='Employment_duration', aggfunc=['mean', 'count'])
pivot_table = pivot_table.rename(columns={'mean': 'The share of defaults', 'count': 'The total number'})

pivot_table.index = pivot_table.index.map({0: '0 years', 1: '1-4 years', 2: '4-7 years', 3: '>7 yeras', 4:'<1 year'})

print(pivot_table)
                    The share of defaults The total number
                                  Default          Default
Employment_duration                                       
0 years                          0.370968               62
1-4 years                        0.306785              339
4-7 years                        0.224138              174
>7 yeras                         0.252964              253
<1 year                          0.406977              172
plt.figure(figsize=(10, 6))
plt.hist(df[df['Default'] == 0]['Employment_duration'], alpha=0.7, label='No default', bins=4)
plt.hist(df[df['Default'] == 1]['Employment_duration'], alpha=0.7, label='Default', bins=4)
plt.xlabel('Employment duration')
plt.ylabel('Number of borrowers')
plt.title('Distribution of work experience for groups with and without default')
plt.legend()
plt.xticks(np.arange(4), ['Unemployed', '1-4 years', '4-7 years', '>7 years'])
plt.show()
_images/ff9891b877dffc43371bbc2c8050bfd080e043bf46740f6eb1f069d0b053d618.png

Data transformation.#

import pandas as pd
df['Employment_duration'].fillna('Unknown', inplace=True)
df['Job'].fillna('Unknown', inplace=True)
df['Employment_duration'] = df['Employment_duration'].astype(str)
df['Job'] = df['Job'].astype(str)
df['Employment_info'] = df['Employment_duration'] + ' - ' + df['Job']
print(df[['Employment_duration', 'Job', 'Employment_info']].head())
  Employment_duration        Job Employment_info
0                   3    skilled     3 - skilled
1                   1    skilled     1 - skilled
2                   2  unskilled   2 - unskilled
3                   2    skilled     2 - skilled
4                   1    skilled     1 - skilled
df.drop(columns = ['Employment_duration'], inplace = True)
df.drop(columns = ['Job'], inplace = True)
df
Checking_balance Loan_duration Credit_history Purpose Amount Savings_balance Percent_of_income Years_at_residrnce Age Other_credit Housing Existing_loans_count Default Employment_info
0 < 0 DM 6 critical furniture/appliances 1169 >= 0 DM 4 4 67 none own 2 0 3 - skilled
1 1 - 200 DM 48 good furniture/appliances 5951 < 100 DM 2 2 22 none own 1 1 1 - skilled
2 >= 0 DM 12 critical education 2096 < 100 DM 2 3 49 none own 1 0 2 - unskilled
3 < 0 DM 42 good furniture/appliances 7882 < 100 DM 2 4 45 none other 1 0 2 - skilled
4 < 0 DM 24 poor car 4870 < 100 DM 3 4 53 none other 2 1 1 - skilled
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 >= 0 DM 12 good furniture/appliances 1736 < 100 DM 3 4 31 none own 1 0 2 - unskilled
996 < 0 DM 30 good car 3857 < 100 DM 4 4 40 none own 1 0 1 - management
997 >= 0 DM 12 good furniture/appliances 804 < 100 DM 4 4 38 none own 1 0 3 - skilled
998 < 0 DM 45 good furniture/appliances 1845 < 100 DM 4 4 23 none other 1 1 1 - skilled
999 1 - 200 DM 45 critical car 4576 100 - 500 DM 3 4 27 none own 1 0 0 - skilled

1000 rows × 14 columns

df['Credit_history'] = df['Credit_history'].astype(str)
df['Percent_of_income'] = df['Percent_of_income'].astype(str)

# Объединяем столбцы с разделителем:
df['Credit_Income_Info'] = df['Credit_history'] + ' - ' + df['Percent_of_income']

print(df[['Credit_history', 'Percent_of_income', 'Credit_Income_Info']].head())
  Credit_history Percent_of_income Credit_Income_Info
0       critical                 4       critical - 4
1           good                 2           good - 2
2       critical                 2       critical - 2
3           good                 2           good - 2
4           poor                 3           poor - 3
df.drop(columns = ['Credit_history'], inplace = True)
df.drop(columns = ['Percent_of_income'], inplace = True)

Result of Data transformation#

df
Checking_balance Loan_duration Purpose Amount Savings_balance Years_at_residrnce Age Other_credit Housing Existing_loans_count Default Employment_info Credit_Income_Info
0 < 0 DM 6 furniture/appliances 1169 >= 0 DM 4 67 none own 2 0 3 - skilled critical - 4
1 1 - 200 DM 48 furniture/appliances 5951 < 100 DM 2 22 none own 1 1 1 - skilled good - 2
2 >= 0 DM 12 education 2096 < 100 DM 3 49 none own 1 0 2 - unskilled critical - 2
3 < 0 DM 42 furniture/appliances 7882 < 100 DM 4 45 none other 1 0 2 - skilled good - 2
4 < 0 DM 24 car 4870 < 100 DM 4 53 none other 2 1 1 - skilled poor - 3
... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 >= 0 DM 12 furniture/appliances 1736 < 100 DM 4 31 none own 1 0 2 - unskilled good - 3
996 < 0 DM 30 car 3857 < 100 DM 4 40 none own 1 0 1 - management good - 4
997 >= 0 DM 12 furniture/appliances 804 < 100 DM 4 38 none own 1 0 3 - skilled good - 4
998 < 0 DM 45 furniture/appliances 1845 < 100 DM 4 23 none other 1 1 1 - skilled good - 4
999 1 - 200 DM 45 car 4576 100 - 500 DM 4 27 none own 1 0 0 - skilled critical - 3

1000 rows × 13 columns